# --- Notebook export: load Strava and Golden Cheetah ride exports and inner-join them ---
# NOTE(review): this file is a flat export of a Jupyter notebook; the `%matplotlib inline`
# lines are IPython magics (not valid plain Python) and body indentation of `if` blocks
# was lost in the export.
import os
import matplotlib as mp
import matplotlib.pyplot as plt  # pyplot imported as plt
import pandas as pd
from matplotlib import pyplot as plt  # same import, alternative syntax (redundant)
from datetime import timedelta
import seaborn as sns
plt.style.use('seaborn')  # NOTE(review): this style name was removed in matplotlib 3.6 (now 'seaborn-v0_8')
%matplotlib inline
os.getcwd() # shows the current working directory
os.chdir("data")
print("""changing the directory to the location where the data-set is available so that now we can read the file""")
os.getcwd()
# Read the Strava export using the date column as a (parsed) datetime index.
strava = pd.read_csv('strava_export.csv', index_col='date', parse_dates=True) #making the date column as index
print(strava.head(2))
# tz_convert requires the parsed index to already be timezone-aware.
strava.index = strava.index.tz_convert('Australia/Sydney')
strava.head()
strava.shape
cheetah = pd.read_csv('cheetah.csv', skipinitialspace=True)
cheetah.head()
# Build a datetime index from the separate 'date' and 'time' string columns, then localise it.
cheetah.index = pd.to_datetime(cheetah['date'] + ' ' + cheetah['time'])
cheetah.index = cheetah.index.tz_localize('Australia/Sydney')
cheetah.head()
print("The shape of strava table is rows, column",strava.shape)
print(strava.get_dtype_counts())  # NOTE(review): get_dtype_counts() was removed in pandas 1.0; use .dtypes.value_counts()
print("\n")
print("getting the datatype for all columns", strava.dtypes)
print("The shape of Cheetah table is rows, column",cheetah.shape)
print(cheetah.get_dtype_counts())  # NOTE(review): removed in pandas 1.0 (see above)
print("\n")
print("getting the datatype for all columns", cheetah.dtypes)
# Inner join of the two tables on their timezone-aware datetime indexes.
cheetah_strava_innerjoin = pd.merge(left = cheetah, right = strava, left_index = True, right_index= True, how='inner')
print("The new shape of the joined table is :",cheetah_strava_innerjoin.shape)
cheetah_strava_innerjoin.head()
# Non-null counts per column, for the joined and the two source tables.
print("cheeta_strava_innerjoin-count---------------------------------------------------------------")
total_row_count = cheetah_strava_innerjoin.count()
print(total_row_count.head(5))
print("strava-count--------------------------------------------------------------------------------")
total_row_count1 = strava.count()
print(total_row_count1.head(5))
print("cheetah-count-------------------------------------------------------------------------------")
total_row_count2 = cheetah.count()
print(total_row_count2.head(5))
cheetah_strava_innerjoin['device_watts'].head()
# Keep only rides where device_watts is not False (i.e. rides with power-meter data).
# NOTE(review): `!= False` also keeps NaN rows (NaN != False is True) — confirm that is intended.
cheetah_strava_innerjoin = cheetah_strava_innerjoin[cheetah_strava_innerjoin.device_watts != False]
print("The Shape of the joined table after droping rows where device_watts is False", cheetah_strava_innerjoin.shape)
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
# Normalise the 'time' column to HH:MM:SS strings and plot time-of-day against ride date.
cheetah_strava_innerjoin['time'] = pd.to_datetime(cheetah_strava_innerjoin['time']).dt.strftime('%H:%M:%S')
sns.scatterplot(cheetah_strava_innerjoin['time'],cheetah_strava_innerjoin.index)
plt.show()
def skewness(x):
    """Return the skewness of the values in *x*.

    Computed as sum((v - mean)**3) / (n * std**3), using the sample
    mean and (pandas/numpy) standard deviation of *x*.
    """
    mean_val = x.mean()
    std_val = x.std()
    cubed_deviation_total = sum((v - mean_val) ** 3 for v in x)
    return cubed_deviation_total / (len(x) * std_val * std_val * std_val)
# --- Distributions & skewness of key ride metrics, pairplots, and a kudos analysis ---
# NOTE(review): sns.distplot is deprecated in seaborn 0.11+ (use displot/histplot).
sns.distplot(cheetah_strava_innerjoin.moving_time)
print("Skewness of the moving_time = ",skewness(cheetah_strava_innerjoin.moving_time))
sns.distplot(cheetah_strava_innerjoin.elapsed_time)
print("Skewness of the elapsed_time = ",skewness(cheetah_strava_innerjoin.elapsed_time))
sns.distplot(cheetah_strava_innerjoin['Time Moving'])
print("Skewness of the Time Moving = ",skewness(cheetah_strava_innerjoin['Time Moving']))
sns.distplot(cheetah_strava_innerjoin.distance)
print("Skewness of the distance = ",skewness(cheetah_strava_innerjoin.distance))
sns.distplot(cheetah_strava_innerjoin['Average Power'])
print("Skewness of the Average Power = ",skewness(cheetah_strava_innerjoin['Average Power']))
# Fill missing average_watts with 0.0 so the distribution/skewness can be computed.
cheetah_strava_innerjoin['average_watts'] = cheetah_strava_innerjoin['average_watts'].fillna(0.0)
sns.distplot(cheetah_strava_innerjoin.average_watts)
print("Skewness of the Average watts = ",skewness(cheetah_strava_innerjoin['average_watts']))
sns.distplot(cheetah_strava_innerjoin.TSS)
print("Skewness of the TSS = ",skewness(cheetah_strava_innerjoin['TSS']))
sns.distplot(cheetah_strava_innerjoin['Average Speed'])
print("Skewness of the Average Speed = ",skewness(cheetah_strava_innerjoin['Average Speed']))
# Numeric subset for pairwise plots and correlation.
C_S_I = cheetah_strava_innerjoin[["distance","moving_time","Average Speed","average_heartrate","Average Power","NP","TSS","Elevation Gain"]]
C_S_I.head()
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
sns.pairplot(C_S_I)
CORRELATION = C_S_I.corr()
CORRELATION
# Same subset plus the categorical workout_type, for grouped plots.
C_S_I1 = cheetah_strava_innerjoin[["distance","workout_type","moving_time","Average Speed","average_heartrate","Average Power","NP","TSS","Elevation Gain"]]
C_S_I1.head()
sns.catplot(x='workout_type',y='average_heartrate',kind='swarm',data=C_S_I1)
plt.grid(True)
sns.catplot(x='workout_type',y='average_heartrate',kind='box',data=C_S_I1)
plt.grid(True)
sns.catplot(x='workout_type',y='distance',kind='swarm',data=C_S_I1)
plt.grid(True)
sns.catplot(x='workout_type',y='distance',kind='box',data=C_S_I1)
plt.grid(True)
#hue='workout_type'
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
sns.pairplot(C_S_I1, hue = 'workout_type')
# What leads to more kudos?
C_S_I2 = cheetah_strava_innerjoin[["workout_type","kudos","distance","moving_time"]]
sns.catplot(x="workout_type", y="kudos", hue="workout_type", kind="box", data=C_S_I2)
sns.catplot(x="workout_type", y="kudos", hue="workout_type", kind="bar", data=C_S_I2)
C_S_I1 = cheetah_strava_innerjoin[["distance","workout_type","kudos","moving_time","Average Speed","average_heartrate","Average Power","NP","TSS","Elevation Gain"]]
CORRELATION = C_S_I1.corr()
CORRELATION
sns.pairplot(C_S_I2, hue = 'workout_type')
CORRELATION2 = C_S_I2.corr()
CORRELATION2
# Build a small frame for per-month aggregation of distance, TSS, and average speed.
suhash = pd.DataFrame()
suhash["date"] = cheetah_strava_innerjoin.date
suhash["distance"] = cheetah_strava_innerjoin.distance
suhash["TSS"] = cheetah_strava_innerjoin.TSS
suhash["AverageSpeed"] = cheetah_strava_innerjoin["Average Speed"]
suhash.head()
from datetime import datetime
from datetime import timedelta
suhash['date'] = pd.to_datetime(suhash['date'])
suhash.head()
suhash1 = suhash.reset_index(drop=True)
suhash1.head()
# Month-end ('M') grouping: sum distance/TSS, mean of AverageSpeed.
# NOTE(review): list-less multi-column selection `['distance','TSS']` on a groupby
# (tuple form) is deprecated/removed in newer pandas — use [['distance','TSS']].
Distance_Month = suhash1.set_index('date').groupby(pd.Grouper(freq='M'))['distance','TSS'].sum()
Distance_Month["Averageg_Speed"] = suhash1.set_index('date').groupby(pd.Grouper(freq='M'))['AverageSpeed'].mean()
Distance_Month = Distance_Month.reset_index()
Distance_Month
plt.figure(figsize = (10,5))
plt.plot(Distance_Month["date"], Distance_Month["distance"], marker = '*')
plt.title("Distance Travelled by month")
# --- Second notebook: Appliances energy prediction data — load and explore ---
import os
os.getcwd()
# NOTE(review): absolute, machine-specific path — breaks on any other machine; prefer a relative path.
os.chdir("C:\\Users\\suhas\\Documents\\GitHub\\portfolio-2019-suhashimmareddy\\Appliances-energy-prediction-data")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pylab as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_selection import RFE
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
edc = pd.read_csv('energydata_complete.csv')
edc.head()
edc.shape
edc.describe()
edc.columns
edc.dtypes
# NOTE(review): this is an alias, NOT a copy — edc and edc1 are the same DataFrame,
# so mutations below also change edc; use edc.copy() for a real backup.
edc1 = edc
edc1['date'] = pd.to_datetime(edc1['date'])
len(edc1)
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
# Derive month name, timestamp, ISO week, and day-of-week per row.
# NOTE(review): the row-by-row loop could be vectorised with edc1['date'].dt accessors.
month = []
hours = []
week = []
Day = []
for i in range (len(edc1)):
month.append(edc1['date'][i].month_name())
hours.append(edc1['date'][i])
week.append(edc1['date'][i].week)
Day.append(edc1['date'][i].dayofweek)
a = set(month)
a
#Convert to Series
A=pd.Series(month)
B=pd.Series(hours).dt.floor("H") #floor the timestamp to the hour
C=pd.Series(week)
D=pd.Series(Day)
#adding the series to the dataframe
edc1['Month']=A
edc1['Hour']=B
edc1['week']=C
edc1['Day']=D
edc1['Time'] = edc1.Hour.dt.hour
edc1.columns
edc1.head()
edc1.shape
b = set(Day)
b
# Distribution of the Appliances consumption target.
plt.figure(figsize=(20, 7))
sns.distplot(edc1.Appliances)
plt.figure(figsize=(20, 7))
plt.hist(edc1["Appliances"], bins='auto', color='#ff0000', alpha=0.7, rwidth=0.85)
plt.show()
plt.figure(figsize=(16, 6))
sns.set(style="whitegrid")
sns.boxplot(edc1["Appliances"])
# Box plot of Appliances energy consumption, grouped by month.
plt.figure(figsize=(20, 7))
sns.boxplot(x="Appliances", y="Month", data=edc1)
plt.figure(figsize=(20, 7))
ax1 = sns.lineplot(data=edc1["Appliances"], color="coral", label="line")
ax1.set_ylabel('Appliances (Wh)');
plt.figure(figsize=(20, 7))
plt.ylabel('Appliances(wh)')
plt.xlabel('per month')
plt.plot(edc1.date,edc1.Appliances)
plt.xticks(rotation = -75)
# NOTE(review): Series.dt.week is deprecated in newer pandas — use .dt.isocalendar().week.
week = edc1["date"].dt.week
first_week = edc1[week == min(week)]
first_week
plt.figure(figsize=(20,7))
plt.plot(first_week['date'],first_week['Appliances'])
# Build an hourly frame for the weekly heat maps: sum Appliances per hour, keep week/day/time keys.
energy1=pd.DataFrame() # new dataframe holding the fields required for the heat map
energy2 =edc1.groupby('Hour',as_index=False).agg({"Appliances": "sum"})
energy2.head()
energy1['Hour'] = edc1['Hour']
energy1['week'] = edc1['week']
energy1['Day'] = edc1['Day']
energy1['Time'] = edc1['Time']
energy1.head()
energy3 =energy1.groupby('Hour',as_index=False).first()
energy3.head()
energy1=pd.merge(energy3,energy2)
energy1.head()
# One Time x Day pivot per ISO week (weeks 3-6).
week3=energy1[energy1.week == 3]
week3 = week3.drop(['Hour', 'week'], axis=1) #selecting only the required columns
week3.head()
week3 = week3.pivot("Time","Day","Appliances")
week4=energy1[energy1.week == 4]
week4 = week4.drop(['Hour', 'week'], axis=1)
week4 = week4.pivot("Time","Day","Appliances")
week5=energy1[energy1.week == 5]
week5 = week5.drop(['Hour', 'week'], axis=1)
week5 = week5.pivot("Time","Day","Appliances")
week6=energy1[energy1.week == 6]
week6 = week6.drop(['Hour', 'week'], axis=1)
week6 = week6.pivot("Time","Day","Appliances")
plt.figure(figsize=(4, 9))
plt.title("Week-3 Heat Map")
ax = sns.heatmap(week3,cmap = "YlOrRd",annot=True, fmt="d", linewidths=.5)
plt.figure(figsize=(4, 9))
plt.title("Week-4 Heat Map")
ax = sns.heatmap(week4,cmap = "YlOrRd",annot=True, fmt="d", linewidths=.5)
plt.figure(figsize=(4, 9))
plt.title("Week-5 Heat Map")
ax = sns.heatmap(week5,cmap = "YlOrRd",annot=True, fmt="d", linewidths=.5)
plt.figure(figsize=(4, 9))
plt.title("Week-6 Heat Map")
ax = sns.heatmap(week6,cmap = "YlOrRd",annot=True, fmt="d", linewidths=.5)
edc1.columns
## each variable distribution and the correlation value with respect appliances is given below
def corr(x, y, **kwargs):
    """Annotate the current matplotlib axes with the Pearson correlation of x and y.

    Intended for use with seaborn's PairGrid.map_lower/map_upper; extra
    keyword arguments from the grid are accepted and ignored.
    """
    rho = np.corrcoef(x, y)[0][1]
    text = r'$\rho$ = ' + str(round(rho, 2))
    axes = plt.gca()
    axes.annotate(text, xy=(0.2, 0.95), size=20, xycoords=axes.transAxes)
# --- Pairplots with correlation annotations, then train/test setup and a linear fit ---
ss = edc1[['Appliances','lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3','RH_3' ]]
g = sns.pairplot(ss)
g.map_lower(corr)
g.map_upper(corr)
plt.show()
ss1 = edc1[['Appliances','T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7']]
g = sns.pairplot(ss1)
g.map_lower(corr)
g.map_upper(corr)
plt.show()
ss2 = edc1[['Appliances','RH_7', 'T8','RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg']]
g = sns.pairplot(ss2)
g.map_lower(corr)
g.map_upper(corr)
plt.show()
ss3 = edc1[['Appliances','RH_out', 'Windspeed','Visibility', 'Tdewpoint', 'rv1', 'rv2']]
g = sns.pairplot(ss3)
g.map_lower(corr)
g.map_upper(corr)
plt.show()
# Model validation: pre-split train/test CSVs, with one-hot encoding of the categoricals.
train = pd.read_csv('training.csv', index_col='date', parse_dates=True)
test = pd.read_csv('testing.csv', index_col='date', parse_dates=True)
train = train.join(pd.get_dummies(train['Day_of_week']))
train.head()
train = train.join(pd.get_dummies(train['WeekStatus']))
train.head()
test = test.join(pd.get_dummies(test['Day_of_week']))
test.head()
test = test.join(pd.get_dummies(test['WeekStatus']))
test.head()
### training the model on the train dataset and validating against the test dataset
# Feature name list (used later for the feature-ranking table).
c = train.columns
c = c.drop(['Day_of_week','WeekStatus','Weekend'])
c
# NOTE(review): X keeps 'Weekend' while c drops it — confirm the two are meant to differ.
X_train = train.drop(['Appliances','Day_of_week','WeekStatus'], axis = 1)
Y_train = train['Appliances']
X_test = test.drop(['Appliances','Day_of_week','WeekStatus'], axis = 1)
Y_test = test['Appliances']
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
### Fitting the linear model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
model = LinearRegression()
model.fit(X_train, Y_train)
def mean_absolute_percentage_error1(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    BUG FIX: the original divided by y_true unconditionally, producing
    inf/nan whenever a true value is exactly zero. Zero-valued true
    entries are now excluded from the mean (the standard MAPE handling);
    results are unchanged when y_true contains no zeros.

    Parameters
    ----------
    y_true : array-like of true target values.
    y_pred : array-like of predicted values, same length as y_true.

    Returns
    -------
    float : mean of |(y_true - y_pred) / y_true| * 100 over nonzero y_true.
    """
    y_true, y_pred = np.asarray(y_true, dtype=float), np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0  # mask out undefined percentage errors
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100
# --- Evaluate the linear model, then RFE-selected linear model, then start the random forest ---
import math
Y_pred = model.predict(X_test)
print("r2 for test data is",r2_score(Y_test, Y_pred))
print("MSE for test data is",mean_squared_error(Y_test, Y_pred))
print("RMSE for test data is",math.sqrt(mean_squared_error(Y_test, Y_pred)))
print("MAE for test data is",mean_absolute_error(Y_test, Y_pred))
print("MAPE for test data is",mean_absolute_percentage_error1(Y_test, model.predict(X_test)))
# Residual plot for the linear model on the test set.
residuals = Y_test - Y_pred
plt.figure(figsize = (10,5))
plt.scatter(test.Appliances,residuals)
plt.xlabel("Appliances")
plt.ylabel("residuals")
Y_pred1 = model.predict(X_train)
print("r2 for train data is",r2_score(Y_train, Y_pred1))
print("MSE for train data is",mean_squared_error(Y_train, Y_pred1))
print("RMSE for train data is",math.sqrt(mean_squared_error(Y_train, Y_pred1)))
print("MAE for train data is",mean_absolute_error(Y_train, Y_pred1))
print("MAPE for train data is",mean_absolute_percentage_error1(Y_train, model.predict(X_train)))
## Fitting the RFE (recursive feature elimination, keeping 30 features)
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
estimator = LinearRegression()
# NOTE(review): positional n_features_to_select was deprecated in scikit-learn 0.24+;
# use RFE(estimator, n_features_to_select=30, step=1).
selector = RFE(estimator, 30, step=1)
selector = selector.fit(X_train, Y_train)
y_predict = selector.predict(X_test)
print("r2 for test data is",r2_score(Y_test, y_predict))
print("MSE for test data is",mean_squared_error(Y_test, y_predict))
print("RMSE for test data is",math.sqrt(mean_squared_error(Y_test, y_predict)))
print("MAE for test data is",mean_absolute_error(Y_test, y_predict))
print("MAPE for test data is",mean_absolute_percentage_error1(Y_test, y_predict))
residuals = Y_test - y_predict
plt.figure(figsize = (10,5))
plt.scatter(test.Appliances,residuals)
plt.xlabel("Appliances")
plt.ylabel("residuals")
Y_pred1 = selector.predict(X_train)
print("r2 for train data is",r2_score(Y_train, Y_pred1))
print("MSE for train data is",mean_squared_error(Y_train, Y_pred1))
print("RMSE for train data is",math.sqrt(mean_squared_error(Y_train, Y_pred1)))
print("MAE for train data is",mean_absolute_error(Y_train, Y_pred1))
print("MAPE for train data is",mean_absolute_percentage_error1(Y_train, selector.predict(X_train)))
## Fitting a Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)
print("r2 for test data is",r2_score(Y_test, Y_pred))
print("MSE for test data is",mean_squared_error(Y_test, Y_pred))
print("RMSE for test data is",math.sqrt(mean_squared_error(Y_test, Y_pred)))
print("MAE for test data is",mean_absolute_error(Y_test, Y_pred))
# BUG FIX: this line previously reported mean_absolute_percentage_error1(Y_test, model.predict(X_test)),
# i.e. the *linear* model's MAPE, in the random-forest section. Use the random-forest
# predictions (Y_pred = regressor.predict(X_test), computed just above).
print("MAPE for test data is",mean_absolute_percentage_error1(Y_test, Y_pred))
# --- Random-forest train metrics, then RFE over a random forest, then ranking setup ---
Y_pred1 = regressor.predict(X_train)
print("r2 for train data is",r2_score(Y_train, Y_pred1))
print("MSE for train data is",mean_squared_error(Y_train, Y_pred1))
print("RMSE for train data is",math.sqrt(mean_squared_error(Y_train, Y_pred1)))
print("MAE for train data is",mean_absolute_error(Y_train, Y_pred1))
print("MAPE for train data is",mean_absolute_percentage_error1(Y_train, regressor.predict(X_train)))
import sys
import warnings
if not sys.warnoptions:
warnings.simplefilter("ignore")
# RFE with a random forest as the estimator, keeping 30 features.
estimator = RandomForestRegressor(n_estimators = 100, random_state = 0)
# NOTE(review): positional n_features_to_select is deprecated in scikit-learn 0.24+.
selector = RFE(estimator, 30, step=1)
selector = selector.fit(X_train, Y_train)
y_predict = selector.predict(X_test)
print("r2 for test data is",r2_score(Y_test, y_predict))
print("MSE for test data is",mean_squared_error(Y_test, y_predict))
print("RMSE for test data is",math.sqrt(mean_squared_error(Y_test, y_predict)))
print("MAE for test data is",mean_absolute_error(Y_test, y_predict))
print("MAPE for test data is",mean_absolute_percentage_error1(Y_test, y_predict))
Y_pred1 = selector.predict(X_train)
print("r2 for train data is",r2_score(Y_train, Y_pred1))
print("MSE for train data is",mean_squared_error(Y_train, Y_pred1))
print("RMSE for train data is",math.sqrt(mean_squared_error(Y_train, Y_pred1)))
print("MAE for train data is",mean_absolute_error(Y_train, Y_pred1))
print("MAPE for train data is",mean_absolute_percentage_error1(Y_train, selector.predict(X_train)))
# Holds one {feature: normalised score} dict per ranking method ("RFE", "RF", "Mean").
ranks = {}
# Create our function which stores the feature rankings to the ranks dictionary
def ranking(ranks, names, order=1):
    """Min-max normalise *ranks* to [0, 1] and pair them with *names*.

    Replaces the original's sklearn MinMaxScaler round-trip with the
    equivalent two-line numpy computation (no behaviour change: a
    constant column maps to all zeros, matching MinMaxScaler).

    Parameters
    ----------
    ranks : sequence of numeric scores, one per feature.
    names : sequence of feature names, same length as ranks.
    order : +1 to rank ascending (default), -1 to invert so that the
        smallest raw value gets the highest normalised score.

    Returns
    -------
    dict : {name: score rounded to 2 decimals}, scores in [0, 1].
    """
    values = order * np.asarray(ranks, dtype=float)
    span = values.max() - values.min()
    if span == 0:
        # All scores equal: MinMaxScaler maps a constant column to 0.
        scaled = np.zeros_like(values)
    else:
        scaled = (values - values.min()) / span
    return {name: round(score, 2) for name, score in zip(names, scaled)}
# --- Rank features via RFE (linear) and random-forest importances, then average and plot ---
# NOTE(review): LinearRegression(normalize=True) was removed in scikit-learn 1.2.
lr = LinearRegression(normalize=True)
lr.fit(X_train,Y_train)
#stop the search when only the last feature is left
rfe = RFE(lr, n_features_to_select=1, verbose =3 )
rfe.fit(X_train,Y_train)
# order=-1: RFE ranks best feature as 1, so invert before normalising.
ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), c, order=-1)
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
rf.fit(X_train,Y_train)
ranks["RF"] = ranking(rf.feature_importances_, c);
# Mean of the per-method normalised scores for each feature.
r = {}
for name in c:
r[name] = round(np.mean([ranks[method][name] for method in ranks.keys()]), 2)
methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")
# Tab-separated table: one row per feature, one column per method.
print("\t%s" % "\t".join(methods))
for name in c:
print("%s\t%s" % (name, "\t".join(map(str,
[ranks[method][name] for method in methods]))))
# Put the mean scores into a Pandas dataframe
meanplot = pd.DataFrame(list(r.items()), columns= ['Feature','Mean Ranking'])
# Sort the dataframe
meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
# Plot the ranking of the features.
# NOTE(review): sns.factorplot and its `size` argument were replaced by catplot/height in seaborn 0.9+.
sns.factorplot(x="Mean Ranking", y="Feature", data = meanplot, kind="bar",
size=14, aspect=1.9, palette='coolwarm')
K-means clustering is one of the simplest and popular unsupervised learning algorithms. Typically, unsupervised algorithms make inferences from datasets using only input vectors without referring to known, or labelled, outcomes. This notebook illustrates the process of K-means clustering by generating some random clusters of data and then showing the iterations of the algorithm as random cluster means are updated.
We first generate random data around 4 centers.
# --- Third notebook: K-means demo — generate 4 Gaussian clusters of 200 points each ---
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# The four true cluster centres.
center_1 = np.array([1,2])
center_2 = np.array([6,6])
center_3 = np.array([9,1])
center_4 = np.array([-5,-1])
# Generate random data and center it to the four centers each with a different variance
np.random.seed(5)
data_1 = np.random.randn(200,2) * 1.5 + center_1
data_2 = np.random.randn(200,2) * 1 + center_2
data_3 = np.random.randn(200,2) * 0.5 + center_3
data_4 = np.random.randn(200,2) * 0.8 + center_4
# Stack into one (800, 2) array and show the raw data.
data = np.concatenate((data_1, data_2, data_3, data_4), axis = 0)
plt.scatter(data[:,0], data[:,1], s=7, c='red')
plt.show()
You need to generate four random centres. This part of the portfolio should contain at least the following:
k is set to 4; centres = np.random.randn(k, c) * std + mean, where std and mean are the standard deviation and mean of the data, and c is the number of features in the data. Set the random seed to 6. Colour the four centres green, blue, yellow, and cyan, and set the edgecolors to red.
std = data.std()
# --- Draw k random starting centres and wrap the data in a DataFrame ---
mean = data.mean()
# NOTE(review): the task text above asks for seed 6; seed 5 is used here — confirm which is intended.
np.random.seed(5)
k = 4
ss = ['green', 'blue', 'yellow', 'cyan']
centres = np.random.randn(k,2)*std + mean ### random initial centroids
plt.scatter(data[:,0], data[:,1], s=7, c='black')
plt.scatter(centres[:,0], centres[:,1],s=100, c=ss)
plt.show()
print(data)
# DataFrame of the points; column -1 is the last (second) column, i.e. the y coordinate.
d1 = pd.DataFrame({
'X_value': data[:, 0],
'Y_value': data[:, -1]
})
print(d1.head())
print(centres)
You need to implement the process of k-means clustering. Implement each iteration as a separate cell: assign each data point to the closest centre, then update the cluster centres based on the data, then plot the new clusters.
Replace this text with your explanation of the algorithm. The resulting notebook should provide a good explanation and demonstration of the K-means algorithm.
# Centroid id (1-4) -> [x, y] position, seeded from the random centres above.
centroids = {
1: [centres[0][0], centres[0][1]], 2: [centres[1][0], centres[1][1]], 3: [centres[2][0], centres[2][1]], 4: [centres[3][0], centres[3][1]]
}
# Assignment Stage
def assignment(df, centroids):
    """Assign each row of *df* to its nearest centroid (Euclidean distance).

    Mutates *df* in place and returns it, adding:
      * distance_from_<i> : distance from the point to centroid i
      * closest           : id (as float) of the nearest centroid
      * color             : plot colour for that centroid (green/blue/yellow/cyan)

    Parameters
    ----------
    df : DataFrame with 'X_value' and 'Y_value' columns.
    centroids : dict mapping centroid id (1-4) -> [x, y].
    """
    for i in centroids.keys():
        # Euclidean distance from every point to centroid i.
        df['distance_from_{}'.format(i)] = (
            np.sqrt(
                (df['X_value'] - centroids[i][0]) ** 2
                + (df['Y_value'] - centroids[i][1]) ** 2
            )
        )
    colmap = {1: 'green', 2: 'blue', 3: 'yellow', 4: 'cyan'}
    centroid_distance_cols = ['distance_from_{}'.format(i) for i in centroids.keys()]
    df['closest'] = df.loc[:, centroid_distance_cols].idxmin(axis=1)
    # BUG FIX: the original used x.lstrip('distance_from_'), which strips a *set of
    # characters*, not the literal prefix — it happens to work for ids 1-4 but would
    # mangle any id containing the characters d/i/s/t/a/n/c/e/f/r/o/m/_ (e.g. >= 10
    # centroids named differently). Take the id after the last underscore instead.
    df['closest'] = df['closest'].map(lambda x: float(x.split('_')[-1]))
    df['color'] = df['closest'].map(lambda x: colmap[x])
    return df
# --- First assignment pass: colour the points by nearest centroid and plot ---
df = assignment(d1, centroids)
print(df.head())
# NOTE(review): sss is built but never used afterwards; 800 is the hard-coded point count.
sss =[]
for i in range(0,800):
sss.append(df['color'][i])
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['X_value'], df['Y_value'], color=df['color'], alpha=0.5, edgecolor='k')
colmap = {1: 'green', 2: 'blue', 3: 'yellow', 4: 'cyan'}
for i in centroids.keys():
plt.scatter(*centroids[i], color=colmap[i])
#update stage
import copy
# Deep copy so arrows can later be drawn from the old to the new centroid positions.
old_centroids = copy.deepcopy(centroids)
def update(k):
    """Move every centroid to the mean of the points currently assigned to it.

    Mutates the module-level ``centroids`` dict in place, reading the
    module-level ``df`` (with its 'closest' assignment column), and
    returns its argument unchanged — the caller passes ``centroids`` in
    and rebinds it to the return value.
    """
    for label in centroids.keys():
        assigned = df[df['closest'] == label]
        centroids[label][0] = np.mean(assigned['X_value'])
        centroids[label][1] = np.mean(assigned['Y_value'])
    return k
# --- Iterate update/assignment until the cluster memberships stop changing ---
centroids = update(centroids)
# Iteration counter; starts at 2 because one assignment and one update have already run.
suhash = 2
fig = plt.figure(figsize=(20, 10))
ax = plt.axes()
plt.scatter(df['X_value'], df['Y_value'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centroids.keys():
plt.scatter(*centroids[i], color=colmap[i])
# Draw arrows showing how far each centroid moved (75% of the displacement, for visibility).
for i in old_centroids.keys():
old_x = old_centroids[i][0]
old_y = old_centroids[i][1]
dx = (centroids[i][0] - old_centroids[i][0]) * 0.75
dy = (centroids[i][1] - old_centroids[i][1]) * 0.75
ax.arrow(old_x, old_y, dx, dy, head_width=0, head_length=0.75, fc=colmap[i], ec=colmap[i])
plt.show()
#Repeat Assignment Stage
df = assignment(df, centroids)
# Plot results
fig = plt.figure(figsize=(20, 10))
plt.scatter(df['X_value'], df['Y_value'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centroids.keys():
plt.scatter(*centroids[i], color=colmap[i])
plt.show()
# Continue until all assigned categories don't change any more
while True:
closest_centroids = df['closest'].copy(deep=True)
centroids = update(centroids)
df = assignment(df, centroids)
suhash += 1
if closest_centroids.equals(df['closest']):
break
fig = plt.figure(figsize=(20, 10))
plt.scatter(df['X_value'], df['Y_value'], color=df['color'], alpha=0.5, edgecolor='k')
for i in centroids.keys():
plt.scatter(*centroids[i], color=colmap[i], s=100)
plt.show()
print("\n")
print("It took " + str(suhash) + " Itterations in order to get the centroids position to be constant")
# --- Cross-check with scikit-learn's KMeans on the same data ---
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# NOTE(review): sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_blobs now lives in sklearn.datasets (and is unused here anyway).
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
ssss = ["cyan","blue","green", "yellow"]
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(data)
plt.scatter(data[:,0], data[:,1])
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c=ssss)
plt.show()
# --- Merge all notebooks in the current directory into one file ---
import json
import os
notebooks_to_merge = [file for file in os.listdir(os.getcwd()) if file.endswith('.ipynb')]
notebooks_to_merge.sort()
print(notebooks_to_merge)
def combine_ipynb_files(list_of_notebooks, combined_file_name):
    """Concatenate the cells of several .ipynb files into one notebook.

    BUG FIX: the original ignored its *list_of_notebooks* parameter and
    read the module-level ``notebooks_to_merge`` instead, so passing a
    different list had no effect. The parameter is now used throughout.

    Parameters
    ----------
    list_of_notebooks : ordered list of notebook file paths; the first
        notebook supplies the surrounding metadata, and the 'cells' of
        the rest are appended in order.
    combined_file_name : path of the merged notebook to write.

    Returns
    -------
    str : absolute (real) path of the written file.
    """
    with open(list_of_notebooks[0], mode='r', encoding='utf-8') as f:
        merged = json.load(f)
    for notebook in list_of_notebooks[1:]:
        with open(notebook, mode='r', encoding='utf-8') as f:
            merged['cells'].extend(json.load(f)['cells'])
    with open(combined_file_name, mode='w', encoding='utf-8') as f:
        json.dump(merged, f)
    print('Generated file: \"{}\".'.format(combined_file_name))
    return os.path.realpath(combined_file_name)
# Merge every notebook found above into a single "merged.ipynb" in the current directory.
combine_ipynb_files(notebooks_to_merge, "merged.ipynb")